{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "95SYNAMCZYsB"
      },
      "source": [
        "***Copyright 2020 Google LLC.***\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\");"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aqZAWIB9Zc4a"
      },
      "source": [
        "Author: Furkan Kocayusufoglu  \n",
        "Term: Summer 2020 Research Internship with Mixel/Brain  \n",
        "Purpose: This notebook analyzes user history and time resolution within such histories. "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Pirvljq39Gjw"
      },
      "outputs": [],
      "source": [
        "import tensorflow.compat.v1 as tf\n",
        "from collections import defaultdict\n",
        "from datetime import datetime\n",
        "import copy\n",
        "import random\n",
        "import json\n",
        "import string\n",
        "import os\n",
        "import numpy as np\n",
        "import itertools\n",
        "import matplotlib. pyplot as plt "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oCxCrNvS-eKf"
      },
      "outputs": [],
      "source": [
        "\"\"\" Read data. For each label, we store a dictionary of users where \n",
        "key:user_id and value: list of \u003citem id, timestamp\u003e tuples.\"\"\"\n",
        "base_dir = 'path/to/your/processed/data'\n",
        "dataset_categories = ['Kindle_Store', 'CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Movies_and_TV', 'Video_Games', 'Pet_Supplies']\n",
        "labels=['Kindle', 'CDs', 'Food', 'Movies', 'Games', 'Pets']\n",
        "data = {label: []  for label in labels}\n",
        "user_lists = {label: defaultdict(list) for label in labels}\n",
        "for dataset_category, label in zip(dataset_categories, labels):\n",
        "  dataset_path = '{}{}_user_item_query_time_mapped.txt'.format(base_dir, dataset_category)\n",
        "  # Read user,item,query,time\n",
        "  with tf.gfile.Open(dataset_path, \"r\") as f:\n",
        "    for line in f:\n",
        "      u, i, _, t = [int(x) for x in line.rstrip().split(\" \")]  # Ignores query.\n",
        "      user_lists[label][u].append([i, t])\n",
        "\n",
        "  print(\"Category {} is done. Number of users = {}\".format(dataset_category, len(user_lists[label])))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iUip6qyYB-qX"
      },
      "outputs": [],
      "source": [
        "\"\"\" Helper functions for timestamp.\"\"\"\n",
        "def convert_to_date(timestamp):\n",
        "  date_time = datetime.fromtimestamp(timestamp)\n",
        "  return date_time.strftime(\"%m/%d/%Y\")  # , %H:%M:%S\n",
        "\n",
        "def time_delta(ts1, ts2):\n",
        "  if ts1 \u003e ts2:\n",
        "    return (datetime.fromtimestamp(ts1) - datetime.fromtimestamp(ts2)).days\n",
        "  else:\n",
        "    return (datetime.fromtimestamp(ts2) - datetime.fromtimestamp(ts1)).days"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "flBVOWmn_i4X"
      },
      "outputs": [],
      "source": [
        "\"\"\" Sampling random users from Kindle category for visual inspection.\"\"\"\n",
        "for idx in random.sample(range(1, 1000), 10):\n",
        "  print([(i, convert_to_date(t), t) for i, t in user_lists['Kindle'][idx]])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u_ZGvaJHYf3s"
      },
      "outputs": [],
      "source": [
        "\"\"\" Plotting temporal resolution of the user history. \"\"\"\n",
        "sampled_users = []\n",
        "num_users_sample = 100\n",
        "user_len_sample = 20\n",
        "while len(sampled_users) \u003c num_users_sample:\n",
        "  idx = random.sample(range(1, len(user_lists['Kindle'])), 1)[0]\n",
        "  if len(user_lists['Kindle'][idx]) != user_len_sample:\n",
        "    continue\n",
        "  sampled_users.append(user_lists['Kindle'][idx])\n",
        "\n",
        "num_users_plot = 5\n",
        "plot_users = random.sample(sampled_users, num_users_plot)\n",
        "bins = [int(i) for i in range(1, user_len_sample)]\n",
        "idx = 0\n",
        "\n",
        "plt.figure(figsize=(10, 8))\n",
        "plt.xlabel('Time period (days)', fontsize=20)\n",
        "plt.xticks(fontsize=20)\n",
        "plt.ylabel('User interactions', fontsize=20)\n",
        "plt.yticks(bins, fontsize=20)\n",
        "\n",
        "for u in plot_users:\n",
        "  last_t = plot_users[-1][-1][1]\n",
        "  time_deltas = list(sorted([time_delta(last_t, t) for i, t in u[:-1]]))\n",
        "  plt.scatter(time_deltas, bins, label = 'user {}'.format(idx+1))\n",
        "  idx += 1\n",
        "\n",
        "plt.legend()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1rq3YghkmVUj"
      },
      "outputs": [],
      "source": [
        "\"\"\" Compute the average time spent between two consecutive items. \"\"\"\n",
        "days_list = {label: [] for label in labels}\n",
        "for label in labels:\n",
        "  for user, items in user_lists[label].items():\n",
        "    nxt_item = items[-1]\n",
        "    for item in reversed(items[:-1]):\n",
        "      time_d = time_delta(nxt_item[1], item[1])\n",
        "      days_list[label].append(time_d)\n",
        "      nxt_item = item\n",
        "\n",
        "  print(\"Mean time spent between two consecutive items {} for {} (in days)\".format(np.mean(days_list[label]), label))\n",
        "  print(\"Median time spent between two consecutive items {} for {} (in days)\".format(np.median(days_list[label]), label))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QyKzmR41uOui"
      },
      "outputs": [],
      "source": [
        "\"\"\" Compute the cumulative ratio of items vs time window (in days) with respect\n",
        " to last item per user. \"\"\"\n",
        "max_day = 365*2  # Consider up to 2 years\n",
        "days_dict  = {label: defaultdict(lambda: []) for label in labels}\n",
        "for label in labels:\n",
        "  for user, items in user_lists[label].items():\n",
        "    last_item = items[-1]\n",
        "    user_time_dict = {i:0 for i in range(max_day)}\n",
        "    scale = len(items) - 1\n",
        "    for item in reversed(items[:-1]):\n",
        "      time_d = time_delta(last_item[1], item[1])\n",
        "      if time_d not in user_time_dict:\n",
        "        break\n",
        "      user_time_dict[time_d] += 1/scale\n",
        "    sum_so_far = 0\n",
        "    for key in sorted(user_time_dict.keys()):\n",
        "      user_time_dict[key] += sum_so_far\n",
        "      sum_so_far = user_time_dict[key]\n",
        "  \n",
        "    for key, value in user_time_dict.items():\n",
        "      days_dict[label][key].append(value)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-hzizWCIY1aO"
      },
      "outputs": [],
      "source": [
        "# Sanity check\n",
        "len(days_dict['Pets'][0]) == len(days_dict['Pets'][100]) == len(days_dict['Pets'][500])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "abDLSiWAakKB"
      },
      "outputs": [],
      "source": [
        "\"\"\" Compute the (non-overlapping) segment densities with respect to different \n",
        "    segment allocations. \n",
        "\"\"\"\n",
        "\n",
        "def _segment_boundary_exp(b, i):\n",
        "  \"\"\" Computes the end of segment i wrt to exponential with base b. The formula simply is: \n",
        "      t = b^i\n",
        "  \"\"\"\n",
        "  return b**i\n",
        "\n",
        "def _segment_boundary_pow(b, i):\n",
        "  \"\"\" Computes the end of segment i wrt to power law with base b. The formula simply is: \n",
        "      t = i**b\n",
        "  \"\"\"\n",
        "  return i**b\n",
        "\n",
        "def _segment_boundary_lin(b, i):\n",
        "  \"\"\" Computes the end of segment i wrt to linear with base b. The formula simply is: \n",
        "      t = b*i\n",
        "  \"\"\"\n",
        "  return b*i"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "g00kVpEzSieJ"
      },
      "outputs": [],
      "source": [
        "bases_exp = [3, 4, 5, 8]\n",
        "num_segments = 10\n",
        "max_day = 365*2  # Consider up to 2 years.\n",
        "segment_analysis_dict_exp = {label:{} for label in labels}\n",
        "for label in labels:\n",
        "  for base in bases_exp:\n",
        "    segment_boundaries = [_segment_boundary_exp(base, i) for i in range(num_segments)]\n",
        "    segment_densities = [0] * num_segments\n",
        "    for user, items in user_lists[label].items():\n",
        "      last_item = items[-1]\n",
        "      scale = len(items) - 1\n",
        "      active_segment_idx = 0\n",
        "      for item in reversed(items[:-1]):\n",
        "        time_d = time_delta(last_item[1], item[1])\n",
        "        if time_d \u003e max_day:\n",
        "          break\n",
        "        while time_d \u003e segment_boundaries[active_segment_idx] and active_segment_idx \u003c (num_segments - 1):\n",
        "          active_segment_idx += 1\n",
        "\n",
        "        segment_densities[active_segment_idx] += 1/scale\n",
        "\n",
        "    segment_analysis_dict_exp[label][base] = np.array(segment_densities) /len(user_lists[label])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QD8blTEv_m1a"
      },
      "outputs": [],
      "source": [
        ""
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "last_runtime": {
        "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
        "kind": "private"
      },
      "name": "user_history_time_analysis.ipynb",
      "provenance": [
        {
          "file_id": "1sFzxaLCscoHYdwdovmgT9B6YTT6RwL22",
          "timestamp": 1600985957315
        }
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
